In [1]:
# # Maximum Number of Threads
# import os
# os.environ['NUMEXPR_MAX_THREADS'] = '24'

# Text
from colorama import Fore, Back, Style

# Computations
import numpy as np
import pandas as pd

# statsmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm

# pymc3
from pymc3 import Model, sample, Normal, HalfCauchy, Uniform
from pymc3 import Deterministic, model_to_graphviz, forestplot, plot_posterior

# preprocessing
from sklearn import preprocessing

# Visualisation libraries

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## missingno
import missingno as msno

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

## arviz
import arviz as az

import warnings
warnings.filterwarnings("ignore")
Mercari Price Suggestion

In this article, we analyze and develop models using Bayesian Methods and PyMC3 for the Mercari Price Suggestion Challenge.

Description from Kaggle page:

Mercari, Japan’s biggest community-powered shopping app, knows this problem deeply. They’d like to offer pricing suggestions to sellers, but this is tough because their sellers are enabled to put just about anything, or any bundle of things, on Mercari's marketplace.

Table of Contents

The files consist of a list of product listings. These files are tab-delimited.

Columns Description
train_id or test_id the id of the listing
name the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage.
These removed prices are represented as (rm)
item_condition_id the condition of the items provided by the seller
category_name category of the listing
brand_name
price the price that the item was sold for. This is the target variable that you will predict. The unit is USD.
This column doesn't exist in test.tsv since that is what you will predict.
shipping 1 if shipping fee is paid by seller and 0 by buyer
item_description the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. \$20) to avoid leakage.
These removed prices are represented as (rm)

We will model price for all categories. Our estimate of the parameter of product price can be considered a prediction. This article is inspired by A Primer on Bayesian Methods for Multilevel Modeling.

Loading the Dataset

In [2]:
Data = pd.read_csv('mercari-price-suggestion-challenge/train.tsv', sep = '\t')
Data.columns = [x.title().replace('Id','ID') for x in Data.columns]
Data.rename(columns = {'Shipping':'Shipping Fee'}, inplace = True)
def Data_info(Inp, Only_NaN = False):
    """Summarise the dtypes and missingness of a DataFrame.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame to summarise.
    Only_NaN : bool, default False
        If True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pd.DataFrame
        Indexed by column name, with 'Data Type', 'Number of NaN Values',
        'Size' (row count of Inp) and 'Percentage' (NaN share, 2 decimals).
    """
    n_rows = Inp.shape[0]
    dtype_frame = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_frame = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    summary = dtype_frame.join(nan_frame, how='outer')
    summary['Size'] = n_rows
    summary['Percentage'] = np.round(100 * summary['Number of NaN Values'] / n_rows, 2)
    if Only_NaN:
        summary = summary.loc[summary['Number of NaN Values'] > 0]
    return summary

# Rich display of the dtype/NaN summary plus a missingno completeness bar chart.
display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
Data Type Number of NaN Values Size Percentage
Brand_Name object 632682 1482535 42.68
Category_Name object 6327 1482535 0.43
Item_Condition_ID int64 0 1482535 0.00
Item_Description object 4 1482535 0.00
Name object 0 1482535 0.00
Price float64 0 1482535 0.00
Shipping Fee int64 0 1482535 0.00
Train_ID int64 0 1482535 0.00
In [3]:
# Listing counts for the Top-20 categories, split by who pays the shipping fee.
Temp = Data.groupby(['Category_Name', 'Shipping Fee'])['Category_Name'].agg({'count'}).\
            rename(columns = {'count':'Count'}).reset_index(drop = False)
# Shipping Fee == 1 means the seller pays (see the data dictionary above).
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')
Temp.reset_index(drop = False, inplace = True)
Top = 20
# Keep only the Top most frequent categories.
Temp0 = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(Temp0)]
del Temp0

C = ['greenyellow', 'seagreen']; SC = 'DarkGreen'  # bar fill colors / bar outline color
fig = px.bar(Temp, y= 'Category_Name', x= 'Count', orientation='h',
             color = 'Shipping Fee', text = 'Count', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 7e4])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [int(x*10e3) for x in np.arange(0,8)]))
fig.update_layout(title = 'Top %i Categories' % Top, plot_bgcolor= 'white', yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [4]:
# Average price of the Top-20 categories, split by who pays the shipping fee.
Temp = (Data.groupby(['Category_Name', 'Shipping Fee'])['Price']
            .agg(['mean'])
            .rename(columns = {'mean':'Average Price'})
            .reset_index(drop = False))
# Shipping Fee == 1 means the seller pays the fee.
Temp['Shipping Fee'] = Temp['Shipping Fee'].map(lambda flag: 'Seller' if flag == 1 else 'Buyer')
Temp = Temp.reset_index(drop = False)
Top = 20
# Restrict to the Top most frequent categories.
top_categories = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(top_categories)]
Temp = Temp.round(2)
del top_categories

C = ['MistyRose', 'FireBrick']; SC = 'DarkRed'  # bar fill colors / bar outline color
fig = px.bar(Temp, y= 'Category_Name', x= 'Average Price', orientation='h',
             color = 'Shipping Fee', text = 'Average Price', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  title = 'Top %i Categories' % Top, plot_bgcolor= 'white',
                  yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(range=[0, 150], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [5]:
# Total revenue (sum of prices) for the Top-20 categories, split by shipping payer.
Temp = (Data.groupby(['Category_Name', 'Shipping Fee'])['Price']
            .agg(['sum'])
            .rename(columns = {'sum':'Total'})
            .reset_index(drop = False))
# Shipping Fee == 1 means the seller pays the fee.
Temp['Shipping Fee'] = Temp['Shipping Fee'].map(lambda flag: 'Seller' if flag == 1 else 'Buyer')
Temp = Temp.reset_index(drop = False)
Top = 20
# Restrict to the Top most frequent categories.
top_categories = Data.Category_Name.value_counts()[:Top].index.tolist()
Temp = Temp.loc[Temp.Category_Name.isin(top_categories)]
Temp = Temp.round(2)
del top_categories

C = ['Azure', 'RoyalBlue']; SC = 'DarkBlue'  # bar fill colors / bar outline color
fig = px.bar(Temp, y= 'Category_Name', x= 'Total', orientation='h',
             color = 'Shipping Fee', text = 'Total', color_discrete_sequence= C)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  title = 'Top %i Categories' % Top, plot_bgcolor= 'white',
                  yaxis_title= 'Category Name', height = 600)
fig.update_xaxes(range=[0, 2.5e6], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Preprocessing

In [6]:
# Drop listings without a category, then work on a fixed-seed 1% sample
# to keep the MCMC runtimes below manageable.
Data.dropna(subset = ['Category_Name'], inplace = True)
# a random sample of the dataset
Data = Data.sample(frac=0.01, random_state=99)
display(Data_info(Data))
_ = msno.bar(Data, figsize=(6,4), fontsize=14, log=False, color="#34495e")
Data Type Number of NaN Values Size Percentage
Brand_Name object 6310 14762 42.74
Category_Name object 0 14762 0.00
Item_Condition_ID int64 0 14762 0.00
Item_Description object 0 14762 0.00
Name object 0 14762 0.00
Price float64 0 14762 0.00
Shipping Fee int64 0 14762 0.00
Train_ID int64 0 14762 0.00
In [7]:
# Price distributions by shipping payer, on the raw and the log scale, and
# construction of the 'Category' code and 'Price_Log' columns.
Temp = Data[['Price', 'Shipping Fee']]
# BUG FIX: Shipping Fee == 1 means the SELLER pays the fee (see the data
# dictionary and the earlier cells); the mapping here was inverted.
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')

C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price)')
fig['layout']['xaxis'].update(range=[0, 100])
fig['layout']['yaxis'].update(range=[0, .1])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()

# Integer-encode the category names; take log(price + 0.1) so that
# zero-priced listings stay finite on the log scale.
le = preprocessing.LabelEncoder()
Data['Category'] = le.fit_transform(Data['Category_Name'])
Data['Price_Log'] = Data['Price'].apply(lambda x: np.log(x+0.1))

# Lookup table mapping the integer category code back to its name.
Category_df = Data.loc[Data.index.isin(Data['Category'].drop_duplicates().index),
                       ['Category_Name','Category']].sort_values(by=['Category']).reset_index(drop = True)

Temp = Data[['Price_Log', 'Shipping Fee']]
# Same fix as above: 1 -> Seller.
Temp[['Shipping Fee']] = Temp[['Shipping Fee']].applymap(lambda x: 'Seller' if x == 1 else 'Buyer')

C = ['aquamarine', 'steelblue']; SC = 'Navy'
fig = px.histogram(Temp, x = 'Price_Log', color = 'Shipping Fee', marginal ="box", histnorm ='probability density',
                   color_discrete_sequence = C,
                   title = 'Probability Density (Price (Logarithm))')
fig['layout']['xaxis'].update(range=[-4, 8])
fig['layout']['yaxis'].update(range=[0, 3])
fig.update_layout(plot_bgcolor= 'WhiteSmoke')
fig.update_traces(marker_line_color= SC, marker_line_width=1.0, opacity=1)
fig.show()

Modeling

In this section, we would like to do a panel data analysis, in which data are collected over time on the same individuals; a regression is then run over these two dimensions [1].

A common panel data regression model: $$y_{it}= \alpha + \beta x_{it}+\varepsilon _{it},$$ where

  • $y$: the dependent variable,
  • $x$: the independent variable,
  • $\alpha$ and $\beta$: coefficients,
  • $i$ and $t$: indices for individuals and time,
  • $\varepsilon _{it}$: error term.
    • In Fixed effects model: $\varepsilon _{it}$ is assumed to vary non-stochastically over $i$ or $t$.
    • In Random effects model: $\varepsilon _{it}$ is assumed to vary stochastically over $i$ or $t$.

For more details, please see [1, 2].

Useful functions throughout the article:

In [8]:
def df_search(df, key):
    """Return the column names of `df` that contain `key` as a substring."""
    return [col for col in df.columns if key in col]
def list_search(List, key):
    """Return the elements of `List` that contain `key` as a substring."""
    return [item for item in List if key in item]

# Map trace variable names to the LaTeX labels used in plot titles/legends.
LaTex = {'alpha': r'$\alpha$', 'beta': r'$\beta$', 'sigma': r'$\sigma$',
         'sigma_alpha': r'$\sigma_{\alpha}$', 'sigma_beta': r'$\sigma_{\beta}$', 'sigma_y': r'$\sigma_{y}$',
         'mu_alpha': r'$\mu_{\alpha}$', 'mu_beta': r'$\mu_{\beta}$'}

Conventional Approaches

Complete Pooling

In complete pooling, we combine all the information from all the categories into a single pool of data. Thus, we treat all categories the same and estimate a single price level. $$y_{i}= \alpha + \beta x_{i}+\varepsilon _{i},$$

  • $y_i$: the logarithm of the price in product $i$,
  • $\alpha$: Original price across all products in the data,
  • $\beta$: Effect on measured $\log$ (price),
  • $\epsilon_i$: the error term.

However, a problem with this approach is the level might be different for different categories.

The model can be defined with the following considerations: \begin{align} \begin{cases} \alpha~\sim~N\left(0,\sigma^2\right),\\ \beta~\sim~N\left(0,\sigma^2\right),\\ \end{cases} \end{align}

where $\sigma$ has a half-Cauchy distribution. A half-Cauchy is one of the symmetric halves of the Cauchy distribution (if unspecified, the right half is intended).

Cauchy Probability Density Function: $$\text{Cauchy}(y|\mu,\sigma) = \frac{1}{\pi \sigma} \ \frac{1}{1 + \left((y - \mu)/\sigma\right)^2}.$$

Moreover, let

Parameter Description
x Shipping Fee
y Price (Log)

To implement this, we use PyMC3.

In [9]:
# Complete-pooling model: a single intercept and slope shared by all categories.
x = Data['Shipping Fee'].values

with Model() as pooled_model:
    
    # Diffuse Normal priors on intercept/slope; HalfCauchy(5) on the noise scale.
    alpha = Normal('alpha', mu = 0, sigma = 1e5, shape=1)
    beta = Normal('beta', mu = 0, sigma = 1e5, shape=1)
    sigma = HalfCauchy('sigma', 5)

    # Linear predictor.
    theta = alpha + beta*x

    # Likelihood of the observed log-prices.
    y = Normal('y', theta, sigma=sigma, observed = Data['Price_Log'].values)

del x
display(model_to_graphviz(pooled_model))
%3 cluster14,762 14,762 alpha alpha ~ Normal y y ~ Normal alpha->y sigma sigma ~ HalfCauchy sigma->y beta beta ~ Normal beta->y
In [10]:
# Sampling
# NUTS is auto-assigned; 1000 tuning draws (discarded) + 1000 kept draws per chain.
with pooled_model:
    pooled_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta, alpha]
Sampling 4 chains, 0 divergences: 100%|███████████████████████████████████████| 8000/8000 [00:05<00:00, 1367.76draws/s]

Complete Pooling Parameters

In [11]:
# Complete Pool Parameters
# Collect the posterior draws of alpha and beta into a frame and plot each
# chain of draws with its posterior mean as a horizontal reference line.
CP_Parm = pd.DataFrame(pooled_trace['alpha'], columns = ['alpha'])
CP_Parm['beta'] = pooled_trace['beta']

# Figures
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

# Left panel: alpha draws and their mean.
_ = ax[0].scatter(x= CP_Parm.index, y= CP_Parm.alpha, color = 'SkyBlue', edgecolors = 'Navy', label = LaTex['alpha'])
_ = ax[0].hlines(CP_Parm.alpha.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'OrangeRed', lw = 3, label = LaTex['mu_alpha'])
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[0].grid(True)
_ = ax[0].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[0].set_ylim([3.06, 3.16])

# Right panel: beta draws and their mean.
_ = ax[1].scatter(x= CP_Parm.index, y= CP_Parm.beta, color = 'LimeGreen', edgecolors = 'DarkGreen', label= LaTex['beta'])
_ = ax[1].hlines(CP_Parm.beta.mean(), -1, CP_Parm.shape[0]+1, linestyles='-',
                 color = 'Navy', lw = 3, label = LaTex['mu_beta'])
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize = 14)
_ = ax[1].grid(True)
_ = ax[1].set_xlim([-1, CP_Parm.shape[0]+1])
_ = ax[1].set_ylim([-.48, -.36])
In [12]:
# Trace plots (density + sample trace, with rug) for the pooled model.
Vars = ['alpha', 'beta', 'sigma']
ax = az.plot_trace(pooled_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]  # each variable owns two axes
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [13]:
# Posterior-mean point estimates of the pooled model and the fitted line.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'alpha = %.4f, beta = %.4f' % (CP_Parm.mean().values[0],
                                                                             CP_Parm.mean().values[1]))
t = np.linspace(-0.2, 1.2)
# BUG FIX: this lambda previously ignored its argument and closed over the
# global `t` (which is deleted below); use the parameter `x` so the function
# works for any input and survives `del t`.
CP_fun = lambda x: CP_Parm.alpha.mean() + CP_Parm.beta.mean() * x

fig = go.Figure()

fig.add_trace(go.Scatter(x= Data['Shipping Fee'].values, y = Data['Price_Log'].values, mode='markers',
                         name='Shipping Fee - Price (Log)'))
fig.add_trace(go.Scatter(x= t, y= CP_fun(t), mode='lines', name='Fitted Model'))
fig.update_layout(title = 'Fitted Model', plot_bgcolor= 'WhiteSmoke',
                  xaxis_title= 'Shipping Fee', yaxis_title= 'Price (Log)', height = 500, width = 650)
fig['layout']['yaxis'].update(range=[-4, 8])
fig.update_layout(xaxis = dict(tickmode = 'array', tickvals = [0,1], ticktext = ['Buyer', 'Seller']))
fig.show()
del t
alpha = 3.1051, beta = -0.4195

Note that this is similar to a Linear Regression model (Check this article for more details regarding Linear Regression model).

In [14]:
# Sanity check: a frequentist OLS fit should reproduce the pooled Bayesian
# point estimates (Intercept ~ alpha, Shipping_Fee coefficient ~ beta).
Temp = Data[['Price_Log', 'Shipping Fee']]
Temp.columns = ['Price_Log', 'Shipping_Fee']  # patsy formula names cannot contain spaces
Results = smf.ols('Price_Log ~ Shipping_Fee', Temp).fit()
Results.summary().tables[1]
Out[14]:
coef std err t P>|t| [0.025 0.975]
Intercept 3.1050 0.009 359.542 0.000 3.088 3.122
Shipping_Fee -0.4196 0.013 -32.717 0.000 -0.445 -0.394

Here, Intercept is the same as $\alpha$ and the coefficient of Shipping Fee is the same as $\beta$.

No Pooling (Unpooled Model)

We assume that there is no connection at all between the Price (Log) levels in the different categories. In other words, we model price in each category independently. That is

$$y_{i}= \alpha_{j[i]} + \beta x_{i}+\epsilon_i,$$
  • $y_i$: the logarithm of the price in product $i$,
  • $\alpha_{j[i]}$: original price in category $j[i]$ for product $i$ and category $j$,
  • $\beta$: Effect on measured $\log$ (price),
  • $\epsilon_i$: the error term.

The model can be defined with the following considerations: \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]}, \beta \sim N(\mu, \sigma^2) \end{align}

In [15]:
# No-pooling model: an independent intercept per category, one shared slope.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))  # number of distinct categories
Category = Data['Category']

with Model() as unpooled_model:

    # One diffuse-prior intercept per category; a single common slope.
    alpha = Normal('alpha', mu = 0, sigma = 1e5, shape= Categories)
    beta = Normal('beta', mu = 0, sigma = 1e5)
    sigma = HalfCauchy('sigma', 5)

    # Index the per-category intercept by each observation's category code.
    theta = alpha[Category] + beta*x

    y = Normal('y', theta, sigma=sigma, observed= Data['Price_Log'].values)
    
display(model_to_graphviz(unpooled_model))
%3 cluster708 708 cluster14,762 14,762 alpha alpha ~ Normal y y ~ Normal alpha->y sigma sigma ~ HalfCauchy sigma->y beta beta ~ Normal beta->y
In [16]:
# Sampling
# NUTS is auto-assigned; 1000 tuning draws (discarded) + 1000 kept draws per chain.
with unpooled_model:
    unpooled_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta, alpha]
Sampling 4 chains, 0 divergences: 100%|█████████████████████████████████████████| 8000/8000 [01:21<00:00, 97.90draws/s]
In [17]:
# Unpooled (No Pool) model parameters: one alpha column per category, plus the common beta.
NP_Parm = pd.DataFrame(unpooled_trace['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
NP_Parm['beta'] = unpooled_trace['beta']
In [18]:
# Scatter of the posterior beta draws with their posterior mean.
fig, ax = plt.subplots(1, 1, figsize=(16, 6.5))

Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]

_ = ax.scatter(x= Temp.index, y= Temp['beta'], color = 'SkyBlue', edgecolors = 'Navy')
# BUG FIX: the horizontal line is the posterior mean of beta, so it must be
# labelled mu_beta (it was previously labelled mu_alpha).
_ = ax.hlines(Temp.mean(), -1, len(Temp.index)+1, linestyles='-',
              color = 'OrangeRed', lw = 3, label = LaTex['mu_beta'])
_ = ax.set_title(r'$\beta$', fontsize = 16)
_ = ax.grid(True)
_ = ax.set_xlim([-1, len(NP_Parm.index)+1])
_ = ax.set_ylim([-.36, -.24])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize = 14)
del Temp
In [19]:
# Trace plots for the unpooled model; variables sorted so the LaTeX titles
# line up with ArviZ's alphabetical panel order.
Vars = ['alpha', 'beta', 'sigma']
Vars = sorted(Vars)
ax = az.plot_trace(unpooled_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]  # two axes (density + trace) per variable
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [20]:
# Posterior density panels, two per row (one extra row when the number of
# variables is odd); unused axes in the grid are switched off.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(unpooled_trace, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

# Hide trailing empty axes.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))

A plot of the ordered Estimates:

In [21]:
# Ordered per-category intercept estimates (mean +/- sd) for the unpooled
# model, plus boxplots of the first 50 categories' posterior distributions.
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([-1, 8])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50  # number of categories to show in the boxplot

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([-1, 8])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [22]:
# One fitted line per category (unpooled intercepts, shared slope) between
# the Buyer (x=0) and Seller (x=1) shipping cases.
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = NP_Parm.copy()
beta = Temp['beta'].mean()
Temp = Temp[df_search(Temp, 'alpha')]

for alpha in Temp.mean(axis=0):
    ax.plot(t, alpha + beta*t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)
delta = 0.1  # horizontal padding around the two tick positions
_ = ax.set_xlim([t[0]-delta, t[1]+delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha, beta, delta

A visual comparison between the pooled and unpooled estimates for a subset of categories:

In [23]:
# Compare the unpooled (per-category) and pooled fits for the 12 categories
# with the lowest unpooled intercepts.
Temp = NP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})

s = 4
fig, ax = plt.subplots(4, 3, figsize=(14, 14))
ax = ax.ravel()

lines = []
labels = []
for i, c in enumerate(Temp.Categories[:12]):
    y = Data.loc[Data.Category_Name==c, 'Price_Log'].values
    x = Data.loc[Data.Category_Name==c, 'Shipping Fee'].values
    # Per-category fitted line from the unpooled model.
    NP_fun = lambda t: Temp.loc[Temp.Categories == c, 'mean'].values[0] + t*NP_Parm['beta'].mean()
    
    # Small horizontal jitter so the 0/1 shipping values do not overplot.
    ax[i].scatter(x + np.random.randn(len(x))*0.01, y, alpha=0.4)
    t = np.linspace(-0.2, 1.2)
    # BUG FIX: the legend labels were swapped -- NP_fun is the No Pooling
    # (per-category) fit and CP_fun the Complete Pooling fit.
    ax[i].plot(t, NP_fun(t), 'k', lw = 2, label = 'No Pooling')
    ax[i].plot(t, CP_fun(t), 'r', lw = 2, label = 'Complete Pooling')
    ax[i].set_xticks([0,1])
    ax[i].set_title(c)
    ax[i].set_ylim([0, 8])
    ax[i].set_xlabel('Shipping Fee')
    ax[i].set_ylabel('Price (Log)')
    
    # Collect one handle/label pair per line for the shared figure legend.
    axLine, axLabel = ax[i].get_legend_handles_labels()
    lines.extend(axLine)
    labels.extend(axLabel)
    del axLine, axLabel
    
fig.legend(lines[:2], labels[:2], bbox_to_anchor=(0.2, 1), loc='lower left',
           ncol=2, title = 'Fitted Model', mode="expand", borderaxespad=0 , fontsize = 14)

plt.tight_layout()

Multilevel and Hierarchical models

There are a few approaches that we discuss here. For example,

  • Ignoring the effect of shipping, no $\beta$,
  • Defining a model that allows intercepts to vary across the category,
  • Defining a model that allows slopes to vary across the category,
  • Defining a model that allows both intercepts and slopes to vary across the category.

Partial Pooling - the Simplest

The simplest possible partial pooling model for the retail price dataset is one that simply estimates prices, $\alpha$, with no other predictors and ignoring the effect of shipping, $\beta$.

In doing so, let, $$y_i = \alpha_{j[i]} + \epsilon_i$$

where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}

In [24]:
# Simplest partial-pooling model: category intercepts drawn from a shared
# Normal(mu_alpha, sigma_alpha) hyperprior; no shipping predictor.
x = Data['Shipping Fee'].values  # NOTE(review): unused in this model (no slope term)
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as partial_pooling:

    # Priors
    mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
    sigma_alpha = HalfCauchy('sigma_alpha', 5)

    # Random intercepts
    alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)

    # Model error
    sigma_y = HalfCauchy('sigma_y',5)

    # Expected value
    y_hat = alpha[Category]

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)

display(model_to_graphviz(partial_pooling))
%3 cluster708 708 cluster14,762 14,762 mu_alpha mu_alpha ~ Normal alpha alpha ~ Normal mu_alpha->alpha sigma_alpha sigma_alpha ~ HalfCauchy sigma_alpha->alpha sigma_y sigma_y ~ HalfCauchy y_like y_like ~ Normal sigma_y->y_like alpha->y_like
In [25]:
# Draw posterior samples: 1000 tuning (discarded) + 1000 kept draws per chain.
with partial_pooling:
    partial_pooling_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma_y, alpha, sigma_alpha, mu_alpha]
Sampling 4 chains, 0 divergences: 100%|████████████████████████████████████████| 8000/8000 [01:02<00:00, 129.01draws/s]
In [26]:
# Partial-pooling model parameters: one alpha column per category.
PP_Parm = pd.DataFrame(partial_pooling_trace['alpha'], columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])
In [27]:
# Trace plots for the partial-pooling model; sorted so the LaTeX titles
# line up with ArviZ's alphabetical panel order.
Vars = ['sigma_y', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(partial_pooling_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]  # two axes (density + trace) per variable
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [28]:
# Posterior density panels for the partial-pooling model, two per row
# (extra row when the variable count is odd); unused axes are hidden.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(partial_pooling_trace, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

# Hide trailing empty axes.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [29]:
# Ordered per-category intercept estimates (mean +/- sd) for the partial-
# pooling model, plus boxplots for the first 50 categories.
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 5])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50  # number of categories to show in the boxplot

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp

Standard Error $$SE = \frac{\sigma} {\sqrt{n}}$$

  • $\text{SE}$ = standard error of the sample
  • $\sigma$ = sample standard deviation
  • $n$ = number of samples
In [30]:
# Compare the raw per-category sample means (+/- SE, left) with the
# partial-pooling posterior means (+/- sd, right) against category sample
# size on a log x-scale; jitter separates overlapping points.
Temp = PP_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]

Group = pd.DataFrame({'n': Data.groupby('Category_Name')['Train_ID'].count(),
                         'mean': Data.groupby('Category_Name')['Price_Log'].mean(),
                         'sd': Data.groupby('Category_Name')['Price_Log'].std()})
Group['se'] = Group['sd']/np.sqrt(Group['n'])

# deviations
jitter = np.random.normal(scale=0.1, size= Temp.shape[1])

fig, ax = plt.subplots(1, 2, figsize=(16,8))
_ = ax[0].scatter(Group['n'] + jitter, Group['mean'], color = 'SkyBlue', edgecolors = 'Navy')
for j, row in zip(jitter, Group.iterrows()):
    # FIX: the original chained the same assignment twice
    # (`n, mean, se = n, mean, se = ...`); one unpacking is equivalent.
    n, mean, se = row[1]['n'], row[1]['mean'], row[1]['se']
    _ = ax[0].plot([n + j, n + j], [mean - se, mean + se], 'k-', alpha = 0.5)
_ = ax[0].set_xscale('log')
_ = ax[0].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[0].set_ylim([0,6])
_ = ax[0].set_xlim([0,1e3])
_ = ax[0].set_title('No Pool Model Estimates Mean')

_ = ax[1].scatter(Group['n'] + jitter, Temp.mean(axis=0), color = 'SkyBlue', edgecolors = 'Navy')
_ = ax[1].hlines(Temp.values.mean(), 0, 1000, linestyles='--', color = 'red', lw = 2)
_ = ax[1].set_xscale('log')
_ = ax[1].set_xlim([0,1e3])
_ = ax[1].set_ylim([0,6])
_ = ax[1].set_title('Partial Pool Model Estimates Mean')
for j, n , mean , std in zip(jitter, Group['n'].values, Temp.mean(axis=0), Temp.std(axis=0)):
    _ = ax[1].plot([n+j, n+j], [mean- std, mean + std], 'k-', alpha = 0.5)
    
del j, n, mean, std, Temp, Group

The unpooled estimates are more imprecise than the partial pool estimates.

Partial Pooling - Varying Intercept

This model allows intercepts to vary across the category, according to a random effect.

$$y_i = \alpha_{j[i]} + \beta x_{i} + \epsilon_i$$

where \begin{align} &\epsilon_i \sim N(0, \sigma_y^2)\\ &\alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2) \end{align}

In [31]:
# Varying-intercept model: per-category intercepts with a shared hyperprior
# plus a single common shipping slope.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_intercept:

    # Priors
    # NOTE(review): tau=1e-4 is a precision (sd = 100), narrower than the
    # sigma=1e5 priors used in the other models -- confirm this is intended.
    mu_alpha = Normal('mu_alpha', mu = 0 , tau=1e-4)
    sigma_alpha = HalfCauchy('sigma_alpha', 5)

    # Random intercepts
    alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
    # Common slope
    beta = Normal('beta', mu = 0 , sigma = 1e5)

    # Model error
    sigma_y = HalfCauchy('sigma_y',5)

    # Expected value
    y_hat = alpha[Category] + beta * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)

display(model_to_graphviz(varying_intercept))
%3 cluster708 708 cluster14,762 14,762 mu_alpha mu_alpha ~ Normal alpha alpha ~ Normal mu_alpha->alpha sigma_alpha sigma_alpha ~ HalfCauchy sigma_alpha->alpha beta beta ~ Normal y_like y_like ~ Normal beta->y_like sigma_y sigma_y ~ HalfCauchy sigma_y->y_like alpha->y_like
In [32]:
# Draw posterior samples: 1000 tuning (discarded) + 1000 kept draws per chain.
with varying_intercept:
    varying_intercept_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma_y, beta, alpha, sigma_alpha, mu_alpha]
Sampling 4 chains, 0 divergences: 100%|████████████████████████████████████████| 8000/8000 [01:04<00:00, 125.00draws/s]
In [33]:
# Varying-intercept model parameters: one alpha column per category plus beta.
VI_Parm = pd.DataFrame(varying_intercept_trace['alpha'],
                       columns = ['alpha%i' %i for i in range(len(Data['Category'].unique()))])

VI_Parm['beta'] = varying_intercept_trace['beta']
In [34]:
# Trace plots for the varying-intercept model; sorted so the LaTeX titles
# line up with ArviZ's alphabetical panel order.
Vars = ['sigma_y', 'beta', 'alpha', 'sigma_alpha', 'mu_alpha']
Vars = sorted(Vars)
ax = az.plot_trace(varying_intercept_trace, var_names = Vars, compact = True, rug = True , figsize = (14, 3*len(Vars)))
ax = ax.ravel()
Temp = [LaTex[x] for x in Vars]
Temp = [s for t in Temp for s in [t] * 2]  # two axes (density + trace) per variable
for i in range(len(ax)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)
In [35]:
# Posterior density panels for the varying-intercept model, two per row
# (extra row when the variable count is odd); unused axes are hidden.
if len(Vars)%2 ==1:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2) + 1), 2, figsize = (14, 2*len(Vars)))
else:
    fig, ax = plt.subplots(int(np.floor(len(Vars)/2)), 2, figsize = (14, 2*len(Vars)))
    
_ = az.plot_posterior(varying_intercept_trace, var_names=Vars, point_estimate = 'mean', ax = ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize = 16)

# Hide trailing empty axes.
if (len(Vars)< len(ax)):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()
        
plt.subplots_adjust(hspace = (0.2)* np.floor(len(Vars)/2))
In [36]:
# Ordered per-category intercept estimates (mean +/- sd) for the varying-
# intercept model, plus boxplots for the first 50 categories.
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
Temp = Temp.agg({'mean','std'}).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop = False).rename(columns = {'index':'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top
_ = ax[0].scatter(x = Temp.index, y = Temp['mean'].values, color = 'b')
for i, mean, std in zip( Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i,i], [mean - std, mean + std], 'r-', alpha = 0.5) 
_ = ax[0].set_xlim([-1, len(Temp.index) +1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize = 18)

# Bottom
Temp = VI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50  # number of categories to show in the boxplot

_ = sns.boxplot(ax = ax[1], data = Temp.iloc[:,:n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize = 18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [37]:
# One fitted line per category: its posterior-mean intercept plus the shared
# posterior-mean slope, evaluated at x = 0 ('Buyer') and x = 1 ('Seller').
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VI_Parm.copy()

slope = Temp['beta'].mean()
for alpha in Temp[df_search(Temp, 'alpha')].mean(axis=0):
    ax.plot(t, alpha + slope * t, color='DarkBlue', marker='o',
            linestyle='--', linewidth=1, markersize=4, alpha=0.4)

delta = 0.1
_ = ax.set_xlim([t[0] - delta, t[1] + delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')
del alpha

Prediction

In [38]:
# Resolve the full category name matching 'Styling Products' and look up its
# integer code for later indexing into the posterior draws.
Temp = list_search(Category_df.iloc[:, 0].to_list(), 'Styling Products')[0]
banner = Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL
print(banner + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [39]:
# Rebuild the varying-intercept model with an extra node that predicts the
# (log) price for the selected category.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_intercept_pred:

    # Priors
    # NOTE(review): mu_alpha uses the tau (precision) parameterization while the
    # other Normals use sigma; tau=1e-4 corresponds to sigma=100, i.e. vague.
    mu_alpha = Normal('mu_alpha', mu = 0 , tau=1e-4)
    sigma_alpha = HalfCauchy('sigma_alpha', 5)

    # Random intercepts (one per category, partially pooled)
    alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
    # Common slope
    beta = Normal('beta', mu = 0 , sigma = 1e5)

    # Model error
    sigma_y = HalfCauchy('sigma_y',5)

    # Expected value
    y_hat = alpha[Category] + beta * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
    
    # prediction: mu = alpha + beta * 1, i.e. the shipping-fee covariate fixed
    # at 1 — presumably "seller pays"; confirm the encoding of 'Shipping Fee'.
    y_pred = Normal('y_pred', mu=alpha[Selected_Category] + beta, sigma=sigma_y)

display(model_to_graphviz(varying_intercept_pred))
%3 cluster708 708 cluster14,762 14,762 sigma_alpha sigma_alpha ~ HalfCauchy alpha alpha ~ Normal sigma_alpha->alpha beta beta ~ Normal y_pred y_pred ~ Normal beta->y_pred y_like y_like ~ Normal beta->y_like sigma_y sigma_y ~ HalfCauchy sigma_y->y_pred sigma_y->y_like mu_alpha mu_alpha ~ Normal mu_alpha->alpha alpha->y_pred alpha->y_like
In [40]:
# Sample the prediction model; y_pred is drawn alongside the parameters.
with varying_intercept_pred:
    varying_intercept_pred_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [y_pred, sigma_y, beta, alpha, sigma_alpha, mu_alpha]
Sampling 4 chains, 0 divergences: 100%|████████████████████████████████████████| 8000/8000 [01:12<00:00, 109.60draws/s]
In [41]:
# Trace and posterior summary for the predictive node y_pred only.
_ = az.plot_trace(varying_intercept_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(varying_intercept_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [42]:
# Posterior predictive draws for the selected category: log scale on the left,
# back-transformed via exp on the right. Both panels share the same layout, so
# draw them in one loop.
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

draws = varying_intercept_pred_trace['y_pred']
cat_name = Category_df.loc[Category_df.Category == Selected_Category,
                           'Category_Name'].values[0]

panels = [(draws, 'SkyBlue', [-1, 6], 'Estimated Price (Logarithm Scale)'),
          (np.exp(draws), 'MediumSeaGreen', [0, 160], 'Estimated Price')]

for axis, (vals, face, ylim, ylabel) in zip(ax, panels):
    _ = axis.scatter(x=np.arange(len(vals)), y=vals, color=face, edgecolors='Navy')
    _ = axis.hlines(vals.mean(), -1, len(vals) + 1, linestyles='-',
                    color='OrangeRed', lw=3, label='mean = %.2f' % vals.mean())
    _ = axis.set_title('Predictions: %s' % cat_name, fontsize=16)
    _ = axis.grid(True)
    _ = axis.set_xlim([-1, len(vals) + 1])
    _ = axis.set_ylim(ylim)
    _ = axis.set_ylabel(ylabel)
    _ = axis.legend(bbox_to_anchor=(1, 1), fontsize=14)
In [43]:
# Empirical price statistics for the selected category, to compare with the
# posterior predictive draws above. Style.RESET_ALL added so the colorama
# styling does not bleed into later output (matches the 'Selected Category'
# print-outs elsewhere in the notebook).
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:' + Style.RESET_ALL)
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376

Partial Pooling - Varying Slope model

An alternative would be

$$y_i = \alpha + \beta_{j[i]} x_{i} + \epsilon_i$$

where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2)\\ \alpha \sim N(0, \sigma^2)\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2) \end{cases} \end{align}

This model highlights the relationship between the measured logarithm of the price, the prevailing price level, and the effect of who pays the shipping fee in the category under which the measurement was made.

In [44]:
# Fit the varying-slope model: a common intercept with one shipping-fee slope
# per category, partially pooled through mu_beta / sigma_beta.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_slope:

    # Priors (tau=0.0001 is the precision parameterization, i.e. sigma=100)
    mu_beta = Normal('mu_beta', mu = 0 , tau=0.0001)
    sigma_beta = HalfCauchy('sigma_beta', 5)

    # Common intercept (shared by all categories)
    alpha = Normal('alpha', mu = 0 , sigma = 1e5)
    # Random slopes (one per category)
    beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)

    # Model error
    sigma_y = HalfCauchy('sigma_y',5)

    # Expected value: category-specific shipping-fee effect
    y_hat = alpha + beta[Category] * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)

display(model_to_graphviz(varying_slope))
%3 cluster708 708 cluster14,762 14,762 alpha alpha ~ Normal y_like y_like ~ Normal alpha->y_like sigma_beta sigma_beta ~ HalfCauchy beta beta ~ Normal sigma_beta->beta mu_beta mu_beta ~ Normal mu_beta->beta sigma_y sigma_y ~ HalfCauchy sigma_y->y_like beta->y_like
In [45]:
# Sampling: 1000 kept draws per chain after 1000 tuning steps (NUTS, 4 chains).
with varying_slope:
    varying_slope_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma_y, beta, alpha, sigma_beta, mu_beta]
Sampling 4 chains, 0 divergences: 100%|█████████████████████████████████████████| 8000/8000 [01:35<00:00, 83.49draws/s]
In [46]:
# Collect the varying-slope posterior draws: one column per category slope,
# plus the common intercept, with columns sorted by name.
n_categories = len(Data['Category'].unique())
beta_cols = ['beta%i' % i for i in range(n_categories)]
VS_Parm = pd.DataFrame(varying_slope_trace['beta'], columns=beta_cols)
VS_Parm['alpha'] = varying_slope_trace['alpha']
VS_Parm = VS_Parm.reindex(sorted(VS_Parm.columns), axis=1)
In [47]:
# Trace plots for all varying-slope parameters, retitled with LaTeX labels.
Vars = sorted(['sigma_y', 'beta', 'alpha', 'sigma_beta', 'mu_beta'])
ax = az.plot_trace(varying_slope_trace, var_names=Vars, compact=True,
                   rug=True, figsize=(14, 3 * len(Vars)))
ax = ax.ravel()
# Each variable owns two panels (density + trace), so repeat each label twice.
Temp = [lab for v in Vars for lab in (LaTex[v],) * 2]
for axis, label in zip(ax, Temp):
    _ = axis.set_title(label, fontsize=16)
In [48]:
# Posterior plots for all parameters on a 2-column grid.
# The original duplicated the subplot call in both parity branches;
# ceil(len(Vars)/2) rows covers the odd and even cases in one expression.
n_rows = int(np.ceil(len(Vars) / 2))
fig, ax = plt.subplots(n_rows, 2, figsize=(14, 2 * len(Vars)))

_ = az.plot_posterior(varying_slope_trace, var_names=Vars, point_estimate='mean', ax=ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize=16)

# Hide any unused panels left over in the grid.
if len(Vars) < len(ax):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()

plt.subplots_adjust(hspace=0.2 * np.floor(len(Vars) / 2))
In [49]:
# Posterior mean ± 1 sd of each category slope (top) and full posterior
# box plots for the first 50 categories (bottom).
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
# Use a list (not a set) so the aggregate rows come out in a deterministic order.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop=False).rename(columns={'index': 'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top: sorted posterior means with ±1 sd whiskers.
_ = ax[0].scatter(x=Temp.index, y=Temp['mean'].values, color='b')
for i, mean, std in zip(Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i, i], [mean - std, mean + std], 'r-', alpha=0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) + 1])
_ = ax[0].set_ylim([-2, 2])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize=18)

# Bottom: box plots of the raw posterior draws for the first n categories.
Temp = VS_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax=ax[1], data=Temp.iloc[:, :n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=90)
_ = ax[1].set_ylim([-3, 3])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize=18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [50]:
# One fitted line per category: the shared posterior-mean intercept plus that
# category's posterior-mean slope, evaluated at x = 0 ('Buyer') and x = 1 ('Seller').
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VS_Parm.copy()

intercept = Temp['alpha'].mean()
for beta in Temp[df_search(Temp, 'beta')].mean(axis=0):
    ax.plot(t, intercept + beta * t, color='DarkBlue', marker='o',
            linestyle='--', linewidth=1, markersize=4, alpha=0.4)

delta = 0.1
_ = ax.set_xlim([t[0] - delta, t[1] + delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')

Prediction

In [51]:
# Resolve the full category name matching 'Styling Products' and look up its
# integer code for later indexing into the posterior draws.
Temp = list_search(Category_df.iloc[:, 0].to_list(), 'Styling Products')[0]
banner = Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL
print(banner + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [52]:
# Rebuild the varying-slope model with an extra node that predicts the
# (log) price for the selected category.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_slope_pred:
    
    # Priors (tau=0.0001 is the precision parameterization, i.e. sigma=100)
    mu_beta = Normal('mu_beta', mu = 0 , tau=0.0001)
    sigma_beta = HalfCauchy('sigma_beta', 5)

    # Common intercept (shared by all categories)
    alpha = Normal('alpha', mu = 0 , sigma = 1e5)
    # Random slopes (one per category)
    beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)

    # Model error
    sigma_y = HalfCauchy('sigma_y',5)

    # Expected value
    y_hat = alpha + beta[Category] * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
    
    # prediction: mu = alpha + beta[Selected_Category] * 1, i.e. the shipping-fee
    # covariate fixed at 1 — presumably "seller pays"; confirm the encoding.
    y_pred = Normal('y_pred', mu=alpha + beta[Selected_Category], sigma=sigma_y)

display(model_to_graphviz(varying_slope_pred))
%3 cluster708 708 cluster14,762 14,762 sigma_y sigma_y ~ HalfCauchy y_pred y_pred ~ Normal sigma_y->y_pred y_like y_like ~ Normal sigma_y->y_like alpha alpha ~ Normal alpha->y_pred alpha->y_like sigma_beta sigma_beta ~ HalfCauchy beta beta ~ Normal sigma_beta->beta mu_beta mu_beta ~ Normal mu_beta->beta beta->y_pred beta->y_like
In [53]:
# Sample the prediction model; y_pred is drawn alongside the parameters.
with varying_slope_pred:
    varying_slope_pred_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [y_pred, sigma_y, beta, alpha, sigma_beta, mu_beta]
Sampling 4 chains, 0 divergences: 100%|█████████████████████████████████████████| 8000/8000 [01:40<00:00, 79.72draws/s]
In [54]:
# Trace and posterior summary for the predictive node y_pred only.
_ = az.plot_trace(varying_slope_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(varying_slope_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [55]:
# Posterior predictive draws for the selected category: log scale on the left,
# back-transformed via exp on the right. Both panels share the same layout, so
# draw them in one loop.
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

draws = varying_slope_pred_trace['y_pred']
cat_name = Category_df.loc[Category_df.Category == Selected_Category,
                           'Category_Name'].values[0]

panels = [(draws, 'SkyBlue', [-1, 6], 'Estimated Price (Logarithm Scale)'),
          (np.exp(draws), 'MediumSeaGreen', [0, 160], 'Estimated Price')]

for axis, (vals, face, ylim, ylabel) in zip(ax, panels):
    _ = axis.scatter(x=np.arange(len(vals)), y=vals, color=face, edgecolors='Navy')
    _ = axis.hlines(vals.mean(), -1, len(vals) + 1, linestyles='-',
                    color='OrangeRed', lw=3, label='mean = %.2f' % vals.mean())
    _ = axis.set_title('Predictions: %s' % cat_name, fontsize=16)
    _ = axis.grid(True)
    _ = axis.set_xlim([-1, len(vals) + 1])
    _ = axis.set_ylim(ylim)
    _ = axis.set_ylabel(ylabel)
    _ = axis.legend(bbox_to_anchor=(1, 1), fontsize=14)
In [56]:
# Empirical price statistics for the selected category, to compare with the
# posterior predictive draws above. Style.RESET_ALL added so the colorama
# styling does not bleed into later output (matches the 'Selected Category'
# print-outs elsewhere in the notebook).
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:' + Style.RESET_ALL)
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376

Partial Pooling - Varying Slope and Intercept

The most general model allows both the intercept and slope to vary by category:

$$y_i = \alpha_{j[i]} + \beta_{j[i]} x_{i} + \epsilon_i$$

where \begin{align} \begin{cases} \epsilon_i \sim N(0, \sigma_y^2),\\ \alpha_{j[i]} \sim N(\mu_{\alpha}, \sigma_{\alpha}^2),\\ \beta_{j[i]} \sim N(\mu_{\beta}, \sigma_{\beta}^2). \end{cases} \end{align}

In [57]:
# Fit the most general model: both the intercept and the shipping-fee slope
# vary by category, each partially pooled through its own hyperpriors.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_intercept_slope:

    # Hyperpriors for the category-level intercepts and slopes
    mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
    sigma_alpha = HalfCauchy('sigma_alpha', 5)
    
    mu_beta = Normal('mu_beta', mu = 0 , sigma = 1e5)
    sigma_beta = HalfCauchy('sigma_beta', 5)
    
    # Random intercepts (one per category)
    alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
    
    # Random slopes (one per category)
    beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)

    # Model error
    # NOTE(review): Uniform prior here vs HalfCauchy in the earlier models.
    sigma_y = Uniform('sigma_y', lower=0, upper=100)

    # Expected value
    y_hat = alpha[Category] + beta[Category] * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)

display(model_to_graphviz(varying_intercept_slope))
%3 cluster708 708 cluster14,762 14,762 sigma_alpha sigma_alpha ~ HalfCauchy alpha alpha ~ Normal sigma_alpha->alpha sigma_y sigma_y ~ Uniform y_like y_like ~ Normal sigma_y->y_like sigma_beta sigma_beta ~ HalfCauchy beta beta ~ Normal sigma_beta->beta mu_beta mu_beta ~ Normal mu_beta->beta mu_alpha mu_alpha ~ Normal mu_alpha->alpha alpha->y_like beta->y_like
In [58]:
# Sampling: 1000 kept draws per chain after 1000 tuning steps (NUTS, 4 chains).
# The log below warns of low effective sample size for some parameters.
with varying_intercept_slope:
    varying_intercept_slope_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma_y, beta, alpha, sigma_beta, mu_beta, sigma_alpha, mu_alpha]
Sampling 4 chains, 0 divergences: 100%|█████████████████████████████████████████| 8000/8000 [03:09<00:00, 42.13draws/s]
The number of effective samples is smaller than 10% for some parameters.
In [59]:
# Collect the varying-intercept-and-slope posterior draws: one alpha column and
# one beta column per category, concatenated side by side.
n_categories = len(Data['Category'].unique())
alpha_draws = pd.DataFrame(varying_intercept_slope_trace['alpha'],
                           columns=['alpha%i' % i for i in range(n_categories)])
beta_draws = pd.DataFrame(varying_intercept_slope_trace['beta'],
                          columns=['beta%i' % i for i in range(n_categories)])
VSI_Parm = pd.concat([alpha_draws, beta_draws], axis=1)
In [60]:
# Trace plots for all varying-intercept-and-slope parameters, retitled with
# LaTeX labels.
Vars = sorted(['sigma_y', 'beta', 'alpha', 'sigma_beta', 'mu_beta', 'sigma_alpha', 'mu_alpha'])
ax = az.plot_trace(varying_intercept_slope_trace, var_names=Vars, compact=True,
                   rug=True, figsize=(14, 3 * len(Vars)))
ax = ax.ravel()
# Each variable owns two panels (density + trace), so repeat each label twice.
Temp = [lab for v in Vars for lab in (LaTex[v],) * 2]
for axis, label in zip(ax, Temp):
    _ = axis.set_title(label, fontsize=16)
In [61]:
# Posterior plots for all parameters on a 2-column grid.
# The original duplicated the subplot call in both parity branches;
# ceil(len(Vars)/2) rows covers the odd and even cases in one expression.
n_rows = int(np.ceil(len(Vars) / 2))
fig, ax = plt.subplots(n_rows, 2, figsize=(14, 2 * len(Vars)))

_ = az.plot_posterior(varying_intercept_slope_trace, var_names=Vars, point_estimate='mean', ax=ax)

Temp = [LaTex[x] for x in Vars]

ax = ax.ravel()
for i in range(len(Vars)):
    _ = ax[i].set_title(Temp[i], fontsize=16)

# Hide any unused panels left over in the grid.
if len(Vars) < len(ax):
    for i in np.arange(len(Vars), len(ax)):
        ax[i].set_axis_off()

plt.subplots_adjust(hspace=0.2 * np.floor(len(Vars) / 2))
In [62]:
# Posterior mean ± 1 sd of each category intercept (top) and full posterior
# box plots for the first 50 categories (bottom).
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
# Use a list (not a set) so the aggregate rows come out in a deterministic order.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop=False).rename(columns={'index': 'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top: sorted posterior means with ±1 sd whiskers.
_ = ax[0].scatter(x=Temp.index, y=Temp['mean'].values, color='b')
for i, mean, std in zip(Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i, i], [mean - std, mean + std], 'r-', alpha=0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) + 1])
_ = ax[0].set_ylim([1, 6])
_ = ax[0].set_ylabel('Estimated Price (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in Estimated Price by Category', fontsize=18)

# Bottom: box plots of the raw posterior draws for the first n categories.
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'alpha')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax=ax[1], data=Temp.iloc[:, :n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=90)
_ = ax[1].set_ylim([0, 6])
_ = ax[1].set_title('Estimated Price Distributions with respect to the Categories', fontsize=18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [63]:
# Posterior mean ± 1 sd of each category slope (top) and full posterior
# box plots for the first 50 categories (bottom).
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
# Use a list (not a set) so the aggregate rows come out in a deterministic order.
Temp = Temp.agg(['mean', 'std']).T
Temp = Temp.sort_values(by=['mean']).reset_index(drop=False).rename(columns={'index': 'Categories'})
fig, ax = plt.subplots(2, 1, figsize=(15.5, 14))

# Top: sorted posterior means with ±1 sd whiskers.
_ = ax[0].scatter(x=Temp.index, y=Temp['mean'].values, color='b')
for i, mean, std in zip(Temp.index, Temp['mean'].values, Temp['std'].values):
    _ = ax[0].plot([i, i], [mean - std, mean + std], 'r-', alpha=0.5)
_ = ax[0].set_xlim([-1, len(Temp.index) + 1])
_ = ax[0].set_ylim([-1, 1])
_ = ax[0].set_ylabel('The Effect of the Paid Shipping Fees (Logarithm Scale)')
_ = ax[0].set_xlabel('Ordered Categories')
_ = ax[0].set_title('Variation in the Effect of the Paid Shipping Fees by Category', fontsize=18)

# Bottom: box plots of the raw posterior draws for the first n categories.
Temp = VSI_Parm.copy()
Temp = Temp[df_search(Temp, 'beta')]
Temp.columns = Category_df['Category_Name']
n = 50

_ = sns.boxplot(ax=ax[1], data=Temp.iloc[:, :n])
_ = ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=90)
_ = ax[1].set_ylim([-2, 2])
_ = ax[1].set_title('The Effect of the Paid Shipping Fees Distributions with respect to Categories', fontsize=18)
_ = ax[1].set_xlabel('Category Name')
del Temp
In [64]:
# One fitted line per category, now with both a category-specific intercept and
# slope, evaluated at x = 0 ('Buyer') and x = 1 ('Seller').
fig, ax = plt.subplots(figsize=(7, 6))
t = np.arange(2)
Temp = VSI_Parm.copy()

intercepts = Temp[df_search(Temp, 'alpha')].mean(axis=0)
slopes = Temp[df_search(Temp, 'beta')].mean(axis=0)
for a, b in zip(intercepts, slopes):
    ax.plot(t, a + b * t, color='DarkBlue', marker='o', linestyle='--',
            linewidth=1, markersize=4, alpha=0.4)

delta = 0.1
_ = ax.set_xlim([t[0] - delta, t[1] + delta])
_ = ax.set_ylim([1, 5])
_ = ax.set_xticks([0, 1])
_ = ax.set_xticklabels(['Buyer', 'Seller'])
_ = ax.set_title('Fitted Relationships by Category')
_ = ax.set_xlabel("Shipping Fee by")
_ = ax.set_ylabel(' Estimated Price (Logarithm Scale)')

Prediction

In [65]:
# Resolve the full category name matching 'Styling Products' and look up its
# integer code for later indexing into the posterior draws.
Temp = list_search(Category_df.iloc[:, 0].to_list(), 'Styling Products')[0]
banner = Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Selected Category:' + Style.RESET_ALL
print(banner + ' %s' % Temp)
Selected_Category = Category_df.loc[Category_df.Category_Name == Temp, 'Category'].values[0]
del Temp
Selected Category: Beauty/Hair Care/Styling Products
In [66]:
# Rebuild the varying-intercept-and-slope model with an extra node that
# predicts the (log) price for the selected category.
x = Data['Shipping Fee'].values
Categories = len(np.sort(Data['Category'].unique()))
Category = Data['Category']

with Model() as varying_intercept_slope_pred:

    # Hyperpriors for the category-level intercepts and slopes
    mu_alpha = Normal('mu_alpha', mu = 0 , sigma = 1e5)
    sigma_alpha = HalfCauchy('sigma_alpha', 5)
    
    mu_beta = Normal('mu_beta', mu = 0 , sigma = 1e5)
    sigma_beta = HalfCauchy('sigma_beta', 5)
    
    # Random intercepts (one per category)
    alpha = Normal('alpha', mu=mu_alpha, sigma=sigma_alpha, shape = Categories)
    
    # Random slopes (one per category)
    beta = Normal('beta', mu=mu_beta, sigma=sigma_beta, shape = Categories)

    # Model error
    sigma_y = Uniform('sigma_y', lower=0, upper=100)

    # Expected value
    y_hat = alpha[Category] + beta[Category] * x

    # Data likelihood
    y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed = Data['Price_Log'].values)
    
    # prediction: mu = alpha[sel] + beta[sel] * 1, i.e. the shipping-fee covariate
    # fixed at 1 — presumably "seller pays"; confirm the encoding of 'Shipping Fee'.
    y_pred = Normal('y_pred', mu=alpha[Selected_Category] + beta[Selected_Category], sigma=sigma_y)

display(model_to_graphviz(varying_intercept_slope_pred))
%3 cluster708 708 cluster14,762 14,762 sigma_alpha sigma_alpha ~ HalfCauchy alpha alpha ~ Normal sigma_alpha->alpha sigma_y sigma_y ~ Uniform y_pred y_pred ~ Normal sigma_y->y_pred y_like y_like ~ Normal sigma_y->y_like sigma_beta sigma_beta ~ HalfCauchy beta beta ~ Normal sigma_beta->beta mu_beta mu_beta ~ Normal mu_beta->beta mu_alpha mu_alpha ~ Normal mu_alpha->alpha alpha->y_pred alpha->y_like beta->y_pred beta->y_like
In [67]:
# Sample the prediction model; y_pred is drawn alongside the parameters.
with varying_intercept_slope_pred:
    varying_intercept_slope_pred_trace = sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [y_pred, sigma_y, beta, alpha, sigma_beta, mu_beta, sigma_alpha, mu_alpha]
Sampling 4 chains, 0 divergences: 100%|█████████████████████████████████████████| 8000/8000 [03:14<00:00, 41.04draws/s]
The number of effective samples is smaller than 10% for some parameters.
In [68]:
# Trace and posterior summary for the predictive node y_pred only.
_ = az.plot_trace(varying_intercept_slope_pred_trace, var_names = ['y_pred'], compact = True, rug = True , figsize = (14, 4)) 
_ = az.plot_posterior(varying_intercept_slope_pred_trace, var_names = ['y_pred'], point_estimate = 'mean', figsize = (7, 4))
In [69]:
# Posterior predictive draws for the selected category: log scale on the left,
# back-transformed via exp on the right. Both panels share the same layout, so
# draw them in one loop.
fig, ax = plt.subplots(1, 2, figsize=(16, 6.5))
ax = ax.ravel()

draws = varying_intercept_slope_pred_trace['y_pred']
cat_name = Category_df.loc[Category_df.Category == Selected_Category,
                           'Category_Name'].values[0]

panels = [(draws, 'SkyBlue', [-1, 6], 'Estimated Price (Logarithm Scale)'),
          (np.exp(draws), 'MediumSeaGreen', [0, 160], 'Estimated Price')]

for axis, (vals, face, ylim, ylabel) in zip(ax, panels):
    _ = axis.scatter(x=np.arange(len(vals)), y=vals, color=face, edgecolors='Navy')
    _ = axis.hlines(vals.mean(), -1, len(vals) + 1, linestyles='-',
                    color='OrangeRed', lw=3, label='mean = %.2f' % vals.mean())
    _ = axis.set_title('Predictions: %s' % cat_name, fontsize=16)
    _ = axis.grid(True)
    _ = axis.set_xlim([-1, len(vals) + 1])
    _ = axis.set_ylim(ylim)
    _ = axis.set_ylabel(ylabel)
    _ = axis.legend(bbox_to_anchor=(1, 1), fontsize=14)
In [70]:
# Empirical price statistics for the selected category, to compare with the
# posterior predictive draws above. Style.RESET_ALL added so the colorama
# styling does not bleed into later output (matches the 'Selected Category'
# print-outs elsewhere in the notebook).
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Exact:' + Style.RESET_ALL)
display(Data.loc[Data.Category == Selected_Category, ['Price', 'Price_Log']].describe().T)
Exact:
count mean std min 25% 50% 75% max
Price 28.0 17.857143 10.582005 4.000000 8.750000 17.500000 24.000000 40.000000
Price_Log 28.0 2.692309 0.674433 1.410987 2.179172 2.867495 3.182212 3.691376